set.seed(2)

## Packages required by this analysis. unique() removes the duplicates the
## original list carried (scales, here, glue, janitor each appeared twice),
## so every package is checked/installed exactly once.
required_packages <- unique(c(
  "tidyverse", "magrittr", "DBI", "bigrquery", "arrow", "glue", "vroom",
  "janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes",
  "extrafont", "plotly", "scales", "stringr", "gganimate", "here",
  "tidytext", "sentimentr", "DT", "sm", "mblm", "fs", "knitr",
  "rmdformats", "urltools", "colorspace", "pdftools", "showtext",
  "pander", "ggridges"
))

## Install any package that is missing, then attach it.
## - TRUE/FALSE are spelled out (T/F are ordinary, reassignable bindings).
## - character.only = TRUE lets require()/library() take the name from a variable.
## - The post-install attach uses library() so a failed installation errors
##   loudly instead of silently returning FALSE as require() would.
for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }
}

## pander: left-align cells in every rendered table.
panderOptions('table.alignment.default', "left")

## quality of png's (dots per inch used when figures are saved)
dpi <- 750

## theme updates; please adjust to client's website
#theme_set(ggthemes::theme_clean(base_size = 15))
## Global ggplot2 theme: theme_clean with the Montserrat family
## (font assumed to be registered via extrafont/showtext -- confirm).
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))


## Project-wide theme tweaks layered on top of theme_clean(): generous
## margins, bold centred titles, top legend, pale solid grid lines.
## NOTE(review): element_line(size = ...) has been deprecated in favour of
## `linewidth` since ggplot2 3.4; kept as-is for compatibility -- confirm the
## installed ggplot2 version before changing.
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped (horizontal-bar) plots: swap which axis carries
## the minor grid so horizontal bars get horizontal guide lines.
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps: empty (void) base theme plus a horizontal legend
## and bold centred titles; captions/subtitles use negative margins to pull
## them toward the map area.
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## Thousands-separated, non-scientific number formatter for labels.
## `scientific = FALSE` is spelled out (F is an ordinary, reassignable binding).
## NOTE(review): small.mark separates digits *after* the decimal point, so
## small.mark = "," is probably unintended -- harmless for integer labels,
## but confirm against scales::format_format() docs.
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = FALSE)

## main color backlinko (brand green) and a darkened variant for accents
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

## colors + labels for interval stripes (light -> dark as the interval narrows)
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Masters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position: 10-step ramp from brand green to pale green
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

Read data

## Read one exported-keywords bucket: numbered CSVs from `subdir`, cleaned
## column names, tagged with its volume-category label. Replaces four
## copy-pasted pmap_df(list(...)) calls -- with a single varying argument,
## map_df() is the idiomatic tool.
read_export_bucket <- function(indices, subdir, cat_label) {
  map_df(indices, ~ read_csv(glue("../proc_data/ahref/{subdir}/{.x}.csv"))) %>%
    clean_names() %>%
    mutate(cat = cat_label)
}

## Combine all four search-volume buckets into one data frame.
df <- bind_rows(
  read_export_bucket(0:99, "export_keywords_100_to_500", "100-500"),
  read_export_bucket(0:99, "export_keywords_500_to_1000", "500-1000"),
  read_export_bucket(0:49, "export_keywords_1000_to_10000", "1000-10000"),
  read_export_bucket(0:33, "export_keywords_10000_to_1000000000", "10000+")
)

## Bucket keyword difficulty into bands. Closed upper bounds (<=) instead of
## between(11, 30) etc., so a fractional score (e.g. 10.5) cannot fall through
## the gaps between the old integer-edged bands; case_when() is
## first-match-wins, so the ordering encodes the band edges. NA (and any
## score > 100) stays NA, exactly as before.
df <- df %>% mutate(
  difficulty_cat = case_when(
    difficulty <= 10 ~ "Easy",
    difficulty <= 30 ~ "Medium",
    difficulty <= 70 ~ "Hard",
    difficulty <= 100 ~ "Super hard"
  ))

## Fixed-size random subsample used for the plots below (full df is ~2.5M rows).
dfs <- df %>% sample_n(20000)

## Report the total row count as a thousands-separated string.
row_count <- format(nrow(df), big.mark = ",")
tibble("Number of rows" = row_count) %>%
  pander()
Number of rows
2,507,759


Preamble


The volume looks weird. It doesn’t follow the initial categories:

## Per-category volume summary (rows, mean, median, extremes),
## smallest median first.
volume_summary <- df %>%
  drop_na(volume) %>%
  group_by(cat) %>%
  summarise(
    n = n(),
    mean = mean(volume),
    median = median(volume),
    max = max(volume),
    min = min(volume)
  ) %>%
  arrange(median)

volume_summary %>% pander()
cat n mean median max min
100-500 714519 291.3 100 7330000 10
500-1000 776898 407.7 200 5750000 10
1000-10000 402317 1064 300 5310000 10
10000+ 272614 13835 800 45260000 10

Let’s look at some of the searches that had a low search volume in the initial data set:

## First five keywords labelled "100-500 volume" that nevertheless report
## more than 10k searches.
df %>%
  filter(cat == "100-500", volume > 10000) %>%
  select(keyword, volume) %>%
  slice_head(n = 5) %>%
  pander()
keyword volume
new orleans 336000
beauty 135000
mine 84000
stupid 83000
newport news 32000

These don't look like low-volume words. And indeed they were not in the original data sets for 100-500 volume; I'm not sure how they got in.

This brings up the issue of upscaling.

For example, let’s say we are interested in the average keyword difficulty. We could simply take the average keyword difficulty of all the keywords in these exported_keyword datasets. But the representation in this data set is skewed, so that the keywords with high search volume are vastly over-represented, compared to the list of words in the original data set.

A more correct approach would be to weight them by how large a pool they are a selection from. E.g., if 100 keywords were left out for each included keyword in the 100-500 volume category, but only 1 keyword was left out for each included keyword in the 10000+ volume category, then each 100-500 keyword should receive 100 times the weight of a 10000+ keyword.

In fact, let’s see:

length_keyword_files <- function(min, max) {
  ## Count US-location (2840) keywords in BigQuery whose search volume lies
  ## in [min, max). Returns a one-row data frame with `count` plus the
  ## interval bounds so map2_df() can stack the buckets.
  ## NOTE(review): `min`/`max` shadow the base R functions inside this scope;
  ## the names are kept because callers pass them positionally.
  sql <- glue("SELECT count(*) as `count`
          FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          AND keyword_info_search_volume >= {min}
          AND keyword_info_search_volume < {max}")
  tb <- bq_project_query("dataforseo-bigquery", sql)
  ## Return the result directly instead of the original dangling `df <- ...`
  ## assignment, which only worked because assignment returns its value
  ## invisibly.
  bq_table_download(tb) %>% mutate(min = min, max = max)
}

## Pool size for each volume bucket; `factor` = keywords existing in BigQuery
## per sampled keyword, assuming 1,000,000 sampled rows per bucket.
## NOTE(review): confirm that 1e6 assumption -- the per-category row counts
## above (e.g. ~402k for 1000-10000) differ from it.
scaling <- map2_df(c(100, 500, 1000, 10000), c(500, 1000, 10000, 1000000000), length_keyword_files) %>% 
    mutate(factor = count / 1000000) %>% relocate(min, max)

scaling %>% pander()
min max count factor
100 500 33059825 33.06
500 1000 5766061 5.766
1000 10000 8449417 8.449
10000 1e+09 1284194 1.284


For now I will just use this list as if it's the truth and not adjust for volume. This can be changed later.


Keyword difficulty

## Overall mean/median keyword difficulty.
## na.rm = TRUE spelled out (T is an ordinary, reassignable binding).
tribble(~Mean, ~Median,
        round(mean(df$difficulty, na.rm = TRUE), 2), median(df$difficulty, na.rm = TRUE)) %>% 
  pander()
Mean Median
14.33 6
## Difficulty vs volume on the 20k subsample: jitter + low alpha against
## overplotting, log10 y axis, loess trend drawn on top of the points.
dfs %>%
  ggplot(aes(x = difficulty, y = volume)) +
  geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
  scale_y_log10(labels = comma) +
  geom_smooth(method='loess', formula= y~x) +
  labs(title = "Keyword difficulty and volume")

## Same relationship per difficulty band; the violin quantile line marks the median.
dfs %>% drop_na(difficulty_cat) %>% 
  ggplot(aes(y = volume, x = difficulty_cat)) +
  geom_violin(draw_quantiles = c(0.5)) +
  scale_y_log10(labels = comma) +
  labs(x = "Difficulty category", title = "Keyword difficulty and volume")

## Difficulty vs cost-per-click, same scatter recipe as above.
dfs %>%
  ggplot(aes(x = difficulty, y = cpc)) +
  geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
  scale_y_log10(labels = comma) +
  geom_smooth(method='loess', formula= y~x) +
  labs(title = "Keyword difficulty and cpc")

## CPC per difficulty band.
dfs %>% drop_na(difficulty_cat) %>% 
  ggplot(aes(y = cpc, x = difficulty_cat)) +
  geom_violin(draw_quantiles = c(0.5)) +
  scale_y_log10(labels = comma) +
  labs(x = "Difficulty category", title = "Keyword difficulty and cpc")


SERP features

## Explode the comma-separated serp_features column into one row per
## (keyword, feature); keywords with no features get a "(None)" placeholder
## row so they survive the grouping below.
dff <- dfs %>% 
  select(keyword, volume, clicks, cpc, serp_features, cps) %>% 
  separate_rows(serp_features, sep = ",") %>% 
  mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features))

## Keywords that show no SERP features at all.
nones <- dff %>% filter(serp_features == "(None)")

## Per-keyword feature count: the "(None)" placeholder row would count as 1,
## so those keywords are reset to 0; counts are capped at "6+" and turned
## into an ordered factor for plotting.
dffn <- dff %>% group_by(keyword) %>% 
  summarise(n_serp = n()) %>% 
  mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>% 
  mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>% 
  mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))

## Re-attach keyword-level columns from the subsample.
## NOTE(review): assumes `keyword` is unique within dfs -- duplicates would
## fan this join out; confirm upstream.
dffn <- left_join(dffn, dfs, by = "keyword")
## Share of sampled keywords showing each SERP feature (denominator is the
## number of keywords, not exploded rows).
dff %>% group_by(serp_features) %>% 
  summarise(prop = n() / nrow(dfs)) %>% 
  ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  scale_x_continuous(labels = scales::percent) +
  labs(y = "SERP feature", x = "", title = "Presence of SERP features")

## Distribution of the number of SERP features per keyword.
dffn %>% group_by(n_serp) %>%
  summarise(n = n() / nrow(dffn)) %>% 
  ggplot(aes(x = n_serp, y = n)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
  scale_y_continuous(labels = scales::percent)

## Feature ordering (ascending mean volume) for the ridge plot below.
## NOTE(review): `order` shadows base::order for the rest of the script.
order <- dff %>% 
  group_by(serp_features) %>%
  summarise(volume = mean(volume, na.rm = T)) %>% 
  arrange(volume) %>% 
  pull(serp_features)


## Volume distribution per SERP feature (ridges), ordered by mean volume;
## x axis clipped to [10, 50000] on a log10 scale.
dff %>% 
  drop_na(serp_features) %>% 
  mutate(serp_features = factor(serp_features, levels = order)) %>% 
  ggplot(aes(y = serp_features, x = volume)) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  scale_x_log10(limits = c(10, 50000)) +
  labs(title = "SERP features and mean volume", x = "Mean volume")

## Mean volume by number of SERP features.
dffn %>% group_by(n_serp) %>%
  summarise(volume = mean(volume, na.rm = T)) %>% 
  ggplot(aes(x = n_serp, y = volume)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "Number of serp features", title = "Number of SERP features and mean volume", y = "Mean volume")

## Full volume distribution by feature count.
dffn %>% 
  ggplot(aes(y = n_serp, x = volume)) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  scale_x_log10(labels = comma) +
  labs(y = "SERP features", title = "Number of SERP features and volume")

## Re-order features by mean clicks for the next ridge plot.
## NOTE(review): overwrites `order` again (still shadowing base::order).
order <- dff %>% 
  group_by(serp_features) %>%
  summarise(mean_clicks = mean(clicks, na.rm = T)) %>% 
  arrange(mean_clicks) %>% 
  pull(serp_features)

## Clicks distribution per feature; the +1 shift keeps zero-click keywords
## representable on the log10 scale.
dff %>% 
  mutate(serp_features = factor(serp_features, levels = order)) %>% 
  mutate(clicks = clicks + 1) %>% 
  ggplot(aes(y = serp_features, x = clicks)) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  scale_x_log10(labels = comma)

## Mean clicks by number of SERP features.
dffn %>% group_by(n_serp) %>%
  summarise(clicks = mean(clicks, na.rm = T)) %>% 
  ggplot(aes(x = n_serp, y = clicks)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "SERP features", title = "Number of SERP features and mean clicks", y = "Mean clicks")

## Clicks distribution by feature count (+1 shift for the log scale again).
dffn %>% 
  mutate(clicks = clicks + 1) %>% 
  ggplot(aes(y = n_serp, x = clicks)) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  scale_x_log10(labels = comma) +
  labs(y = "SERP features", title = "Number of SERP features and clicks")


Global volume

## Long format: every keyword appears twice -- once with its US volume and
## once with its international volume (global minus US).
dfv <- 
  bind_rows(
    df %>% mutate(region = "US"),
    df %>% mutate(volume = global_volume - volume, region = "International")
    )

International searches have much more total volume:

## Total search volume per region, formatted with thousands separators.
dfv %>% group_by(region) %>% 
  summarise(volume = format(sum(volume, na.rm = T), big.mark = ",")) %>% 
  pander()
region volume
International 10,042,589,870
US 4,724,571,610

Internationally there are more searches with very low volume, while US has more searches with medium volume.

## Keyword counts per volume bucket and region.
## NOTE(review): the bucket edges overlap (100 and 1000 satisfy two
## conditions); case_when() is first-match-wins, so boundary values land in
## the earlier bucket -- fine, but the labels suggest otherwise.
dfv %>% drop_na(volume) %>% 
  mutate(volume_group = case_when(volume < 100 ~ "< 100",
    between(volume, 100, 1000) ~ "100 - 1000",
    between(volume, 1000, 10000) ~ "1000 - 10,000",
    volume > 10000 ~ "10,000 +")) %>% 
  mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 +"))) %>% 
  group_by(volume_group, region) %>% 
  summarise(n = n()) %>% 
  ggplot(aes(x = volume_group, y = n, fill = region)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.8)

There is not a large difference in the number of searches with very high volume. However, the total volume of these searches is a lot higher internationally.

## Total search volume (not counts: n = sum(volume)) per finer-grained
## bucket and region; same first-match-wins boundary behaviour as above.
dfv %>% drop_na(volume) %>% 
  mutate(volume_group = case_when(volume < 100 ~ "< 100",
    between(volume, 100, 1000) ~ "100 - 1000",
    between(volume, 1000, 10000) ~ "1000 - 10,000",
    between(volume, 10000, 100000) ~ "10,000 - 100,000",
    between(volume, 100000, 1000000) ~ "100,000 - 1M",
    volume > 1000000 ~ "1M +")) %>% 
  mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 - 100,000", "100,000 - 1M", "1M +"))) %>% 
  group_by(volume_group, region) %>% 
  summarise(n = sum(volume)) %>% 
  ggplot(aes(x = volume_group, y = n, fill = region)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.8)

Looking at the number of searches with above 1 million volume:

## Per-keyword international volume and the log10 gap between international
## and US volume (positive = more international). Keywords with non-positive
## international or global volume are dropped so the logs are defined.
## NOTE(review): volume == 0 is not filtered and would give log10(0) = -Inf
## -- presumably volume is always >= 10 in this data set; confirm.
df_int <- df %>% mutate(international_volume = global_volume - volume) %>% 
  filter(international_volume > 0, global_volume > 0) %>% 
  mutate(volume_diff = log10(international_volume) - log10(volume))

## Count keywords above one million searches, internationally vs in the US.
n_over_1m_int <- df_int %>% filter(international_volume > 1e6) %>% nrow()
n_over_1m_us <- df_int %>% filter(volume > 1e6) %>% nrow()
tibble(International = n_over_1m_int, US = n_over_1m_us) %>%
  pander()
International US
1139 237
## 200k-row subsample for the scatter plots below.
df_int_s <- df_int %>% sample_n(200000)

## US vs international volume; the diagonal marks equal volume in both.
df_int_s %>% 
  ggplot(aes(x = volume, y = international_volume)) +
  geom_jitter(size = 0.05, alpha = 0.02, height = 0.15, width = 0.15) +
  scale_x_log10(labels = comma, expand = c(0,0)) +
  scale_y_log10(labels = comma, expand = c(0,0)) +
  geom_abline(intercept = 0, slope = 1, color = "turquoise4", size = 1) +
  labs(x = "US volume", y = "International volume")


We can see that they mostly follow each other, but there are some searches with large difference between them.

Higher volume internationally:

## Top 5 keywords skewed toward international volume.
## NOTE(review): `filter(volume != 60)` is an unexplained ad-hoc exclusion --
## it silently drops every keyword with exactly 60 US searches; document the
## reason or remove it.
df_int %>% 
  arrange(desc(volume_diff)) %>% 
  filter(volume != 60) %>% 
  select(keyword, us_volume = volume, international_volume) %>% 
  head(5) %>% 
  pander()
keyword us_volume international_volume
filmoviplex 10 295990
cloroquina 200 5869800
parivahan sewa 10 276990
jokaroom 10 173990
handball em 20 327980


Higher volume in US:

## Five keywords with the largest US-over-international volume gap
## (most negative volume_diff first).
df_int %>%
  arrange(volume_diff) %>%
  select(keyword, us_volume = volume, international_volume) %>%
  slice_head(n = 5) %>%
  pander()
keyword us_volume international_volume
football playoff schedule 602000 1000
frontier mail 586000 1000
spectrum mobile 526000 1000
chase bank near me 523000 1000
spectrum internet 998000 2000


Searches that have higher volume in US have a higher click-per-search on average than searches that have higher volume internationally.

## Clicks-per-search vs the US/international volume gap, with a linear trend;
## custom x breaks label the direction of the skew.
df_int_s %>% 
  ggplot(aes(x = volume_diff, y = cps)) +
  geom_point(alpha = 0.08, size = 0.08) +
  #scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


They also have a higher cost-per-click on average

## Cost-per-click vs the volume gap (log10 y), same recipe as the cps plot.
df_int_s %>% 
  ggplot(aes(x = volume_diff, y = cpc)) +
  geom_point(alpha = 0.08, size = 0.08) +
  scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


Searches that have higher volume internationally, tend to have higher difficulty

## Keyword difficulty vs the volume gap, linear trend.
df_int_s %>% 
  ggplot(aes(x = volume_diff, y = difficulty)) +
  geom_point(alpha = 0.08, size = 0.08) +
  #scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


Clicks

## Overall mean/median clicks per keyword.
## na.rm = TRUE spelled out (T is an ordinary, reassignable binding).
tribble(~Mean, ~Median,
        round(mean(df$clicks, na.rm = TRUE), 2), median(df$clicks, na.rm = TRUE)) %>% 
  pander()
Mean Median
3125 307
## Distribution of clicks per search on the 20k subsample (log10 x axis).
dfs %>% ggplot(aes(x = clicks)) +
  geom_histogram(fill = "turquoise4", color = "black") +
  scale_x_log10(labels = comma) +
  labs(title = "Distribution of number of clicks per search", y = "", x = "")

!!!J: I don’t think there is any data in the file about organic / paid?